#coding:utf8
import urllib.request
import re
import os

class SpiderHelper(object):
    """Helper for mirroring pages and images from a tutorial web site.

    The instance only stores the site root in ``HOST``; every method works
    on the HTML or the lists handed to it.
    """

    # Navigation <ul> block of a page (DOTALL so the list may span lines).
    _MENU_RE = re.compile(
        r'<ul class="uk\-nav uk\-nav\-side" style="margin\-right\:\-15px;">(.*?)</ul>',
        re.S,
    )
    # One navigation entry: (id, optional indent style, href, title).
    _MENU_ITEM_RE = re.compile(
        r'<li id="(.*?)"( style="margin-left\:\dem;")?>\s{0,16}<a href="(.*?)">(.*?)</a>\s{0,16}</li>',
        re.S,
    )
    # Attachment image tag: (attachment directory, file name).
    _IMG_RE = re.compile(r'<img src="/files/attachments/(\w*?)/(\w*?)" alt=".*?">')
    # Exact whitespace run that separates entries inside the navigation list.
    _MENU_ITEM_SEP = "\n    \n        "

    def __init__(self, host="http://www.liaoxuefeng.com"):
        """Remember the site root that relative links are resolved against."""
        self.HOST = host

    def downloadPage(self, url):
        """Fetch ``url`` and return its body decoded as UTF-8.

        The response is closed even when reading or decoding fails.
        Errors raised by ``urlopen`` or by decoding propagate to the caller.
        """
        with urllib.request.urlopen(url) as fp:
            return fp.read().decode("utf8")

    def getJs(self, html):
        """Extract JavaScript resources from ``html`` (not implemented yet)."""
        pass

    def getCss(self, html):
        """Extract CSS resources from ``html`` (not implemented yet)."""
        pass

    def getResource(self):
        """Download resource files (not implemented yet)."""
        pass

    def getMenu(self, html):
        """Parse the navigation menu out of ``html``.

        Returns a list with one element per menu chunk; each element is the
        list of ``(id, style, href, title)`` tuples found in that chunk
        (``style`` is ``''`` for entries without an indent style).  A chunk
        that does not match yields an empty list.  When the page contains no
        navigation list at all, an empty list is returned instead of raising
        ``IndexError``.
        """
        menu_blocks = self._MENU_RE.findall(html)
        if not menu_blocks:
            return []

        # Only the first navigation list on the page is used.
        menuItems = menu_blocks[0].split(self._MENU_ITEM_SEP)
        return [self._MENU_ITEM_RE.findall(menu) for menu in menuItems]

    def getContent(self, html):
        """Extract the article content from ``html`` (not implemented yet)."""
        pass

    def getImages(self, html):
        """Return ``(directory, file)`` tuples for every attachment image in ``html``."""
        return self._IMG_RE.findall(html)

    def downImg(self, url, saveFile):
        """Download ``url`` and store it at the local path ``saveFile``."""
        urllib.request.urlretrieve(url, saveFile)

    def downloadImages(self, imgList):
        """Download every image described by ``imgList``.

        ``imgList`` holds ``(directory, file)`` tuples as produced by
        ``getImages``.  Each image is fetched from and stored under its own
        ``files/attachments/<directory>`` folder (created on demand), so
        images from different attachment directories no longer collide.
        An empty list is a no-op.
        """
        for img in imgList:
            print(img)
            base_dir = 'files/attachments/' + img[0]
            os.makedirs(base_dir, exist_ok=True)

            url = self.HOST + '/' + base_dir + '/' + img[1]
            # NOTE(review): every file is saved with a .png suffix regardless
            # of its real type - confirm this is intended.
            savePath = base_dir + '/' + img[1] + '.png'
            print(url)
            print(savePath)
            self.downImg(url, savePath)

    def saveToFile(self, html, path):
        """Write the text ``html`` to ``path`` as UTF-8, replacing any existing file."""
        with open(path, 'w', encoding='utf8') as f:
            f.write(html)

    def downloadAllPages(self, menuList):
        """Download and save every page listed in ``menuList``.

        ``menuList`` has the shape returned by ``getMenu``; only the first
        tuple of each element is used and empty elements are skipped.  A page
        whose href is ``/a/b`` is written to ``./a/b.html``.  Entries without
        an indent style additionally get a directory ``./a/b``.  Missing
        parent directories are created and existing ones are reused, so the
        method can be re-run safely.
        """
        for item in menuList:
            if not item:
                # Menu chunk that did not match the entry pattern.
                continue

            t_item = item[0]
            print("downloading: " + t_item[3])
            html = self.downloadPage(self.HOST + t_item[2])

            local_path = '.' + t_item[2]
            if t_item[1] == '':
                os.makedirs(local_path, exist_ok=True)

            # Make sure the folder that will hold the .html file exists.
            parent = os.path.dirname(local_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            self.saveToFile(html, local_path + ".html")

# Script entry point: running this file directly currently does nothing.
if __name__ == "__main__":
    pass